Clean data
# Convert the categorical columns of `dat` to factors in one pass.
factor_cols <- c(
  "gender", "marital_status", "category", "class",
  "survived", "embarked", "disembarked"
)
dat[factor_cols] <- lapply(dat[factor_cols], as.factor)
# Build `nationality2`, a cleaned copy of `nationality` used by the tables:
#   * values listed in `known_nationalities` are kept unchanged;
#   * the stray value "Sweden" is folded into "Swedish" (it was previously
#     relabelled "Sweedish", which created a separate one-row category);
#   * "Syrian,Lebanese" is relabelled "Syrian/Lebanese";
#   * "Unknown" becomes NA so it can be excluded with `!is.na(nationality2)`;
#   * any other value, including a missing nationality, becomes
#     "Other - Multiple".
known_nationalities <- c(
  "English", "Irish", "American", "Swedish", "Finnish", "Scottish",
  "French", "Italian", "Canadian", "Bulgarian", "Croatian", "Belgian",
  "Norwegian", "Channel Islander", "Welsh", "Swiss", "German", "Danish",
  "Spanish", "Australian", "Polish", "South African", "Bosnian",
  "Hong Kongese", "Dutch", "Lithuanian", "Greek", "Portuguese",
  "Uruguayan", "Chinese", "Slovenian", "Cape Verdean", "Egyptian",
  "Japanese", "Hungarian", "Latvian", "Austrian", "Mexican", "Turkish",
  "Guyanese", "Haitian"
)

dat <- dat %>%
  mutate(
    nationality2 = case_when(
      # as.character() keeps the labels even if `nationality` is a factor.
      nationality %in% known_nationalities ~ as.character(nationality),
      nationality == "Sweden" ~ "Swedish",
      nationality == "Syrian,Lebanese" ~ "Syrian/Lebanese",
      # Typed NA keeps the column character.
      nationality == "Unknown" ~ NA_character_,
      TRUE ~ "Other - Multiple"
    )
  )
# Passenger-only subset with the four analysis variables; rows with a
# missing value in any of them are removed.
datpass <- dat %>%
  filter(category == "Passenger") %>%
  select(survived, gender, class, age) %>%
  na.omit()
Descriptives
# Passenger counts by class and gender, with each gender's share of its class.
dat %>%
  filter(category == "Passenger", !is.na(gender)) %>%
  group_by(class, gender) %>%
  # summarise() drops the last grouping level (gender), so the result is
  # still grouped by class and `percent` below is computed within each class.
  summarise(count = n()) %>%
  mutate(percent = (count / sum(count)) * 100) %>%
  kable(
    caption = "Breakdown of Passengers by Class and Gender",
    col.names = c("Class", "Gender", "Count", "Percent"),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kable_styling()
Breakdown of Passengers by Class and Gender
|
Class
|
Gender
|
Count
|
Percent
|
|
1st Class
|
Female
|
153
|
43.71
|
|
1st Class
|
Male
|
197
|
56.29
|
|
2nd Class
|
Female
|
112
|
38.36
|
|
2nd Class
|
Male
|
180
|
61.64
|
|
3rd Class
|
Female
|
216
|
30.47
|
|
3rd Class
|
Male
|
493
|
69.53
|
# Breakdown of passenger nationalities, largest share first.
# Rows are restricted to category == "Passenger" so the table matches its
# caption and the other passenger tables; without this filter every
# category in `dat` was counted. Rows with an unknown nationality (NA in
# `nationality2`) are excluded from both the counts and the percentages.
dat %>%
  filter(category == "Passenger", !is.na(nationality2)) %>%
  group_by(nationality2) %>%
  # After summarise() the result is ungrouped, so `percent` is each
  # nationality's share of all remaining passengers.
  summarise(count = n()) %>%
  mutate(percent = (count / sum(count)) * 100) %>%
  arrange(desc(percent)) %>%
  kable(
    caption = "Breakdown of Passenger Nationalities",
    col.names = c("Nationality", "Count", "Percent"),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kable_styling()
Breakdown of Passenger Nationalities
|
Nationality
|
Count
|
Percent
|
|
English
|
1037
|
42.36
|
|
Irish
|
361
|
14.75
|
|
American
|
246
|
10.05
|
|
Other - Multiple
|
116
|
4.74
|
|
Swedish
|
99
|
4.04
|
|
Syrian/Lebanese
|
86
|
3.51
|
|
Finnish
|
58
|
2.37
|
|
Scottish
|
49
|
2.00
|
|
French
|
44
|
1.80
|
|
Italian
|
41
|
1.67
|
|
Canadian
|
39
|
1.59
|
|
Bulgarian
|
33
|
1.35
|
|
Croatian
|
28
|
1.14
|
|
Belgian
|
26
|
1.06
|
|
Norwegian
|
26
|
1.06
|
|
Channel Islander
|
25
|
1.02
|
|
Welsh
|
23
|
0.94
|
|
Swiss
|
22
|
0.90
|
|
German
|
14
|
0.57
|
|
Danish
|
11
|
0.45
|
|
Spanish
|
9
|
0.37
|
|
Australian
|
7
|
0.29
|
|
Polish
|
6
|
0.25
|
|
South African
|
5
|
0.20
|
|
Bosnian
|
4
|
0.16
|
|
Hong Kongese
|
4
|
0.16
|
|
Dutch
|
3
|
0.12
|
|
Greek
|
3
|
0.12
|
|
Lithuanian
|
3
|
0.12
|
|
Uruguayan
|
3
|
0.12
|
|
Chinese
|
2
|
0.08
|
|
Portuguese
|
2
|
0.08
|
|
Slovenian
|
2
|
0.08
|
|
Austrian
|
1
|
0.04
|
|
Cape Verdean
|
1
|
0.04
|
|
Egyptian
|
1
|
0.04
|
|
Guyanese
|
1
|
0.04
|
|
Haitian
|
1
|
0.04
|
|
Hungarian
|
1
|
0.04
|
|
Japanese
|
1
|
0.04
|
|
Latvian
|
1
|
0.04
|
|
Mexican
|
1
|
0.04
|
|
Sweedish
|
1
|
0.04
|
|
Turkish
|
1
|
0.04
|
# Passenger nationalities within each class (all nationalities listed),
# ordered by class and then by descending share.
dat %>%
  filter(category == "Passenger", !is.na(nationality2)) %>%
  group_by(class, nationality2) %>%
  # summarise() drops the nationality grouping level, leaving the result
  # grouped by class, so `percent` is the share within each class.
  summarise(count = n()) %>%
  mutate(percent = (count / sum(count)) * 100) %>%
  arrange(class, desc(percent)) %>%
  kable(
    caption = "Breakdown of Passenger Nationalities by Class (All)",
    col.names = c("Class", "Nationality", "Count", "Percent"),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kable_styling()
Breakdown of Passenger Nationalities by Class (All)
|
Class
|
Nationality
|
Count
|
Percent
|
|
1st Class
|
American
|
195
|
57.35
|
|
1st Class
|
English
|
51
|
15.00
|
|
1st Class
|
Canadian
|
27
|
7.94
|
|
1st Class
|
Other - Multiple
|
14
|
4.12
|
|
1st Class
|
French
|
10
|
2.94
|
|
1st Class
|
Irish
|
6
|
1.76
|
|
1st Class
|
Swiss
|
6
|
1.76
|
|
1st Class
|
German
|
5
|
1.47
|
|
1st Class
|
Scottish
|
5
|
1.47
|
|
1st Class
|
Spanish
|
4
|
1.18
|
|
1st Class
|
Swedish
|
4
|
1.18
|
|
1st Class
|
Uruguayan
|
3
|
0.88
|
|
1st Class
|
Belgian
|
2
|
0.59
|
|
1st Class
|
Italian
|
2
|
0.59
|
|
1st Class
|
Channel Islander
|
1
|
0.29
|
|
1st Class
|
Dutch
|
1
|
0.29
|
|
1st Class
|
Egyptian
|
1
|
0.29
|
|
1st Class
|
Mexican
|
1
|
0.29
|
|
1st Class
|
Norwegian
|
1
|
0.29
|
|
1st Class
|
Polish
|
1
|
0.29
|
|
2nd Class
|
English
|
145
|
51.06
|
|
2nd Class
|
Other - Multiple
|
25
|
8.80
|
|
2nd Class
|
American
|
24
|
8.45
|
|
2nd Class
|
Channel Islander
|
12
|
4.23
|
|
2nd Class
|
Irish
|
12
|
4.23
|
|
2nd Class
|
French
|
11
|
3.87
|
|
2nd Class
|
Scottish
|
8
|
2.82
|
|
2nd Class
|
Finnish
|
6
|
2.11
|
|
2nd Class
|
Swedish
|
6
|
2.11
|
|
2nd Class
|
Canadian
|
5
|
1.76
|
|
2nd Class
|
South African
|
4
|
1.41
|
|
2nd Class
|
Spanish
|
4
|
1.41
|
|
2nd Class
|
Danish
|
3
|
1.06
|
|
2nd Class
|
Italian
|
3
|
1.06
|
|
2nd Class
|
Lithuanian
|
2
|
0.70
|
|
2nd Class
|
Swiss
|
2
|
0.70
|
|
2nd Class
|
Syrian/Lebanese
|
2
|
0.70
|
|
2nd Class
|
Welsh
|
2
|
0.70
|
|
2nd Class
|
Australian
|
1
|
0.35
|
|
2nd Class
|
Belgian
|
1
|
0.35
|
|
2nd Class
|
German
|
1
|
0.35
|
|
2nd Class
|
Haitian
|
1
|
0.35
|
|
2nd Class
|
Hungarian
|
1
|
0.35
|
|
2nd Class
|
Japanese
|
1
|
0.35
|
|
2nd Class
|
Norwegian
|
1
|
0.35
|
|
2nd Class
|
Portuguese
|
1
|
0.35
|
|
3rd Class
|
English
|
112
|
15.80
|
|
3rd Class
|
Irish
|
105
|
14.81
|
|
3rd Class
|
Swedish
|
89
|
12.55
|
|
3rd Class
|
Syrian/Lebanese
|
83
|
11.71
|
|
3rd Class
|
Other - Multiple
|
69
|
9.73
|
|
3rd Class
|
Finnish
|
52
|
7.33
|
|
3rd Class
|
Bulgarian
|
33
|
4.65
|
|
3rd Class
|
Croatian
|
28
|
3.95
|
|
3rd Class
|
Norwegian
|
24
|
3.39
|
|
3rd Class
|
American
|
23
|
3.24
|
|
3rd Class
|
Belgian
|
22
|
3.10
|
|
3rd Class
|
Danish
|
7
|
0.99
|
|
3rd Class
|
Scottish
|
6
|
0.85
|
|
3rd Class
|
Welsh
|
6
|
0.85
|
|
3rd Class
|
Canadian
|
5
|
0.71
|
|
3rd Class
|
French
|
5
|
0.71
|
|
3rd Class
|
Polish
|
5
|
0.71
|
|
3rd Class
|
Swiss
|
5
|
0.71
|
|
3rd Class
|
Bosnian
|
4
|
0.56
|
|
3rd Class
|
Hong Kongese
|
4
|
0.56
|
|
3rd Class
|
Italian
|
4
|
0.56
|
|
3rd Class
|
Greek
|
3
|
0.42
|
|
3rd Class
|
Channel Islander
|
2
|
0.28
|
|
3rd Class
|
Chinese
|
2
|
0.28
|
|
3rd Class
|
German
|
2
|
0.28
|
|
3rd Class
|
Slovenian
|
2
|
0.28
|
|
3rd Class
|
Australian
|
1
|
0.14
|
|
3rd Class
|
Austrian
|
1
|
0.14
|
|
3rd Class
|
Latvian
|
1
|
0.14
|
|
3rd Class
|
Lithuanian
|
1
|
0.14
|
|
3rd Class
|
Portuguese
|
1
|
0.14
|
|
3rd Class
|
Sweedish
|
1
|
0.14
|
|
3rd Class
|
Turkish
|
1
|
0.14
|
# Mean, minimum and maximum passenger age per class.
# Rows with a missing age are dropped first, so no na.rm is needed below.
dat %>%
  filter(category == "Passenger", !is.na(age)) %>%
  group_by(class) %>%
  summarise(
    avg_age = mean(age),
    min_age = min(age),
    max_age = max(age)
  ) %>%
  kable(
    caption = "Average Age by Class",
    col.names = c("Class", "Average Age", "Minimum Age", "Maximum Age"),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kable_styling()
Average Age by Class
|
Class
|
Average Age
|
Minimum Age
|
Maximum Age
|
|
1st Class
|
39.12
|
0
|
71
|
|
2nd Class
|
30.01
|
0
|
71
|
|
3rd Class
|
25.12
|
0
|
74
|